Exogenous RNA Coverage Plots

Author

Lance Parsons

Load libraries

This project uses renv to keep track of installed packages. Install renv if not installed and load dependencies with renv::restore().

install.packages("renv")
renv::restore()

Read sample data

Get list of samples

Code
samples <- read_tsv("config/samples.tsv",
  col_types = list(
    sample_name = col_character(),
    cell_line = col_factor(),
    exogenous_rna = col_factor(),
    day = col_factor()
  )
)
units <- read_tsv("config/units.tsv",
  col_types = list(
    sample_name = col_character(),
    unit_name = col_character(),
    fq1 = col_character(),
    fq2 = col_character()
  )
)
sample_units <- dplyr::left_join(samples, units, by = "sample_name") %>%
  unite(sample_unit, sample_name, unit_name, remove = FALSE) %>%
  mutate(
    day = as.factor(paste0("day", day)),
    batch = as.factor(case_when(
      exogenous_rna == "mastermix1" ~ "batch3",
      exogenous_rna == "mastermix2" ~ "batch3",
      TRUE ~ "batch2"
    )),
    .before = cell_line,
  )

sample_units

Table of exogenous RNA mixtures

Code
# Exogenous RNA mixtures
rna_mixes <- tibble()
for (mix in c("mastermix1", "mastermix2", "PJY103_mDNMT1", "PJY300_mDNMT1")) {
  t <- Biostrings::readDNAStringSet(sprintf("data/references/%s.fa", mix))
  rna_mixes <- rbind(rna_mixes, tibble(
    exogenous_rna = mix,
    rna_species = word(t@ranges@NAMES, 1),
    start = 1,
    end = t@ranges@width
  ))
}

rna_mixes <- rna_mixes %>%
  mutate(
    active_start_min = case_when(
      exogenous_rna == "mastermix1" ~ 11,
      exogenous_rna == "mastermix2" ~ 11,
      exogenous_rna == "PJY103_mDNMT1" ~ 11,
      exogenous_rna == "PJY300_mDNMT1" ~ 11
    ),
    active_start_max = case_when(
      exogenous_rna == "mastermix1" ~ 14,
      exogenous_rna == "mastermix2" ~ 14,
      exogenous_rna == "PJY103_mDNMT1" ~ 15,
      exogenous_rna == "PJY300_mDNMT1" ~ 15
    ),
    cyptic_terminator_end = case_when(
      exogenous_rna == "mastermix1" ~ 37,
      exogenous_rna == "mastermix2" ~ 37,
      exogenous_rna == "PJY103_mDNMT1" ~ 38,
      exogenous_rna == "PJY300_mDNMT1" ~ 38
    ),
  )

rna_mixes

Exogenous RNA counts

BAM reading functions

Code
rna_species_plot_range <- function(sequence_name) {
  plot_range <- rna_mixes %>%
    dplyr::filter(.data$rna_species == sequence_name) %>%
    dplyr::select(start, end) %>%
    unique()
  return(plot_range)
}

# GRanges from BAM
granges_from_bam <- function(sample_unit,
                             sequence_name,
                             is_proper_pair = TRUE,
                             bam_dir =
                               "results/alignments/exogenous_rna/sorted") {
  plot_range <- rna_species_plot_range(sequence_name)
  which <- GRanges(
    sprintf("%s:%i-%i", sequence_name, plot_range$start, plot_range$end)
  )

  param <- ScanBamParam(
    flag = scanBamFlag(isProperPair = is_proper_pair),
    mapqFilter = 1,
    which = which
  )

  aligned_fragments_list <- list()

  if (is_proper_pair) {
    aligned_fragments <- granges(
      readGAlignmentPairs(
        sprintf("%s/%s.bam", bam_dir, sample_unit),
        param = param
      ),
      on.discordant.seqnames = "drop"
    )
    rna_info <- rna_mixes %>%
      filter(rna_species == {{ sequence_name }}) %>%
      select(-.data$exogenous_rna) %>%
      unique()

    aligned_fragments_list$active <- aligned_fragments %>%
      plyranges::filter(
        start <= rna_info$active_start_max,
        end >= rna_info$cyptic_terminator_end
      )
    aligned_fragments_list$inactive <- aligned_fragments %>%
      plyranges::filter(
        start > rna_info$active_start_max,
        end >= rna_info$cyptic_terminator_end
      )
    aligned_fragments_list$premature_termination <- aligned_fragments %>%
      plyranges::filter(end < rna_info$cyptic_terminator_end)
  } else {
    aligned_fragments <- granges(
      readGAlignments(
        sprintf("%s/%s.bam", bam_dir, sample_unit),
        param = param
      )
    )
    aligned_fragments_list$discordant <- aligned_fragments
  }
  return(aligned_fragments_list)
}

Read BAM coverage

Code
rna_species_plot_data <- list()
for (rna_species_to_plot in rna_mixes$rna_species) {
  rna_info <- rna_mixes %>%
    dplyr::filter(rna_species == rna_species_to_plot)

  sample_units_to_plot <- sample_units %>%
    dplyr::filter(exogenous_rna %in% rna_info$exogenous_rna)

  granges_to_plot <- list()
  for (sample_unit in sample_units_to_plot$sample_unit) {
    granges_to_plot[[sample_unit]] <- granges_from_bam(
      sample_unit,
      rna_species_to_plot,
      TRUE
    )
  }
  rna_species_plot_data[[rna_species_to_plot]] <- granges_to_plot
}

Organize coverage data

Code
exogenous_rna_count_data <- tibble(
  sample_unit = character(),
  sequence_name = character(),
  category = character(),
  count = numeric()
)
for (rna_species in names(rna_species_plot_data)) {
  for (sample_unit in names(rna_species_plot_data[[rna_species]])) {
    for (category in
      names(rna_species_plot_data[[rna_species]][[sample_unit]])) {
      exogenous_rna_count_data <- exogenous_rna_count_data %>%
        add_row(
          sample_unit = sample_unit,
          sequence_name = rna_species,
          category = category,
          count = length(
            rna_species_plot_data[[rna_species]][[sample_unit]][[category]]
          )
        )
    }
  }
}
exogenous_rna_count_data

Summarize coverage data

Code
exogenous_rna_mapped_totals <- exogenous_rna_count_data %>%
  group_by(sample_unit, sequence_name) %>%
  summarize(mapped_fragments = sum(count))
`summarise()` has grouped output by 'sample_unit'. You can override using the
`.groups` argument.
Code
exogenous_rna_mapped_totals

Coverage plots

Code
# Colors for each series
colors <- list(
  "Parental" <- list( # nolint: object_name_linter
    "active" = "#800000",
    "inactive" = "#b4604e",
    "premature_termination" = "#dfaea3"
  ),
  "P1E10" <- list( # nolint: object_name_linter
    "active" = "#003869",
    "inactive" = "#637499",
    "premature_termination" = "#b0b7cb"
  )
)

# Loop over species and plot coverage
for (rna_species in rna_mixes %>%
  pull(rna_species) %>%
  unique()) {
  gtrack <- GenomeAxisTrack()

  # Read FASTA sequence and trim off description (after space)
  dna <- readDNAStringSet(
    sprintf("data/references/exogenous-rna/%s.fa", rna_species)
  )
  names(dna) <- names(dna) %>% str_remove(" +.*$")
  sequence_track <- SequenceTrack(dna,
    genome = rna_species,
    chromosome = rna_species,
    cex = 0.5
  )

  # Get list of sample units we will plot
  sample_units_to_plot <- sample_units %>%
    filter(sample_unit %in% names(rna_species_plot_data[[rna_species]]))

  # Loop over days to create two overlay tracks
  day_tracks <- list()
  day_ylims <- list()
  days_to_plot <- sample_units_to_plot %>%
    pull(day) %>%
    levels()
  for (day in days_to_plot) {
    # Get sample units for day
    sample_units_for_day <- sample_units_to_plot %>%
      filter(day == {{ day }}) %>%
      pull(sample_unit)

    # Loop over sample units for this day
    data_tracks <- list()
    for (sample_unit in sample_units_for_day) {
      cell_line <- sample_units_to_plot %>%
        filter(sample_unit == {{ sample_unit }}) %>%
        pull(cell_line)
      for (category in
        names(rna_species_plot_data[[rna_species]][[sample_unit]])) {
        # Calculate coverage
        data <- as(
          coverage(
            rna_species_plot_data[[rna_species]][[sample_unit]][[category]]
          ),
          "GRanges"
        ) %>%
          # Remove non-matching sequences
          keepSeqlevels(rna_species, pruning.mode = "coarse")

        # Normalize scores
        norm_factor <- exogenous_rna_mapped_totals %>%
          filter(
            sample_unit == {{ sample_unit }},
            sequence_name == {{ rna_species }}
          ) %>%
          pull(mapped_fragments)
        score(data) <- score(data) / norm_factor

        # Create GViz data track
        data_track <- DataTrack(data,
          name = day,
          type = "l",
          col = colors[[cell_line]][[category]],
          strand = "*"
        )
        data_tracks <- c(data_tracks, data_track)
      }
    } # end sample_unit loop
    day_tracks[[day]] <- OverlayTrack(trackList = data_tracks, name = day)
    day_ylims[[day]] <- ylims <- extendrange(range(lapply(data_tracks, values)))
  } # end day loop

  start <- rna_mixes %>%
    filter(rna_species == {{ rna_species }}) %>%
    pull(start) %>%
    unique()
  end <- rna_mixes %>%
    filter(rna_species == {{ rna_species }}) %>%
    pull(end) %>%
    unique()

  annotations <- read_tsv(
    sprintf("data/references/exogenous-rna/%s.bed", rna_species),
    col_names = c("chrom", "chromStart", "chromEnd", "name"),
    col_types = "ciic"
  )

  annotations_positions <- annotations %>%
    dplyr::filter(name %in% c(
      "cryptic_terminator_end", "sgRNA_end", "edit",
      "nick", "pegRNA_end", "terminator_end"
    ))

  annotation_track <- AnnotationTrack(
    chromosome = rna_species,
    start = annotations$chromStart + 1,
    end = annotations$chromEnd,
    id = annotations$name,
    fill = "#cfafc3",
    featureAnnotation = "id",
    fontcolor.feature = "#666666",
    name = "(e)pegRNA features"
  )

  ht <- HighlightTrack(
    trackList = c(sequence_track, annotation_track, day_tracks),
    start = annotations_positions$chromStart + 1,
    end = annotations_positions$chromEnd,
    chromosome = rna_species,
    col = "darkgrey",
    fill = "#FFFFFF00",
    lwd = 0.5
  )

  plotTracks(c(gtrack, ht),
    chromosome = rna_species,
    from = start,
    to = end,
    ylim = c(0, max(unlist(day_ylims))),
    main = rna_species,
    lwd = 3
  )
}